{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Installation of transformers\n",
    "!pip install transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import AutoTokenizer, GPT2LMHeadModel\n",
    "\n",
    "\n",
    "torch.manual_seed(12046)\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained('gpt2')\n",
    "model = GPT2LMHeadModel.from_pretrained('gpt2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([[2061,  318,  262, 3139,  286, 2807,   30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# An example to use tokenizer\n",
    "question = 'What is the capital of China?'\n",
    "ids = tokenizer(question, return_tensors='pt')\n",
    "ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([ 2061,   318,   262,  3139,   286,  2807,    30,   198,   198,   464,\n",
      "         3139,   318,  2807,    13,   632,   338,   262,  4387,  3773,   287,\n",
      "          262,   995,    11,   290,   340,   468,   257,  3265,   286,   517,\n",
      "          621,   352,    13,    20,  2997,   661,    13,  2807,   318,   530,\n",
      "          286,   262, 14162,  3957, 16533,   319,  3668,    11,   351,   281,\n",
      "         5079, 12396,   286,   720,    16,    13,    17, 12989,    13,   383,\n",
      "         1499,   318,  1363,   284,   257,  1271,   286,   995,    12,  4871,\n",
      "        11155,    11,  1390,   262,  2059,   286,  3442,    11, 14727,    11,\n",
      "          262,  3999,  8581,   286,  5483, 13473,    11,  2807,   338,  2351,\n",
      "         5800,  5693,   290,   262, 21865,  8581,   329,  5800,    13,   198])\n",
      "What is the capital of China?\n",
      "\n",
      "The capital is China. It's the largest economy in the world, and it has a population of more than 1.5 billion people. China is one of the fastest growing economies on Earth, with an annual GDP of $1.2 trillion. The country is home to a number of world-class universities, including the University of California, Berkeley, the Chinese Academy of Social Sciences, China's National Science Foundation and the Shanghai Academy for Science.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# As GPT-2 is NOT very good at text generation,\n",
    "# we increase num_beams and no_repeat_ngram_size to get better generation.\n",
    "res = model.generate(**ids, max_length=100, early_stopping=True,\n",
    "                     num_beams=3, no_repeat_ngram_size=2)\n",
    "print(res[0])\n",
    "print(tokenizer.decode(res[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Template for questions and answers\n",
    "template = '''\n",
    "Q: What is the capital of the United Kingdom?\n",
    "A: London.\n",
    "\n",
    "Q: What is the capital of France?\n",
    "A: Paris.\n",
    "\n",
    "Q: %s\n",
    "A:\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Q: What is the capital of the United Kingdom?\n",
      "A: London.\n",
      "\n",
      "Q: What is the capital of France?\n",
      "A: Paris.\n",
      "\n",
      "Q: What is the capital of China?\n",
      "A:\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(template % question)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "ids2 = tokenizer(template % question , return_tensors='pt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Q: What is the capital of the United Kingdom?\n",
      "A: London.\n",
      "\n",
      "Q: What is the capital of France?\n",
      "A: Paris.\n",
      "\n",
      "Q: What is the capital of China?\n",
      "A:\n",
      "... Beijing.\n"
     ]
    }
   ],
   "source": [
    "# Use template to get better text generation\n",
    "res2 = model.generate(**ids2, max_length=100, early_stopping=True,\n",
    "                      num_beams=3, no_repeat_ngram_size=2)\n",
    "print(tokenizer.decode(res2[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "中国的首都在哪里？。\n",
      "\n",
      "非常和属经啊建城的自己,那么做喜址器,没有陛下解长支提队的现�\n"
     ]
    }
   ],
   "source": [
    "# The performance of GPT-2 on Chinese is NOT good\n",
    "question_zh = '中国的首都在哪里？'\n",
    "ids_zh = tokenizer(question_zh, return_tensors='pt')\n",
    "res_zh = model.generate(**ids_zh, max_length=100, early_stopping=True,\n",
    "                        num_beams=3, no_repeat_ngram_size=2)\n",
    "print(tokenizer.decode(res_zh[0], skip_special_tokens=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Q: What is the capital of the United Kingdom?\n",
      "A: London.\n",
      "\n",
      "Q: What is the capital of France?\n",
      "A: Paris.\n",
      "\n",
      "Q: 中国的首都在哪里？\n",
      "A:\n",
      "The capital city of China, Shanghai, is located in the middle of a vast expanse of land that stretches from the north to the south. It is also known as the \"Great Wall of\n"
     ]
    }
   ],
   "source": [
    "# Even using template, we can NOT get better text generation\n",
    "ids_zh2 = tokenizer(template % question_zh , return_tensors='pt')\n",
    "res_zh2 = model.generate(**ids_zh2, max_length=100, early_stopping=True,\n",
    "                         num_beams=3, no_repeat_ngram_size=2)\n",
    "print(tokenizer.decode(res_zh2[0], skip_special_tokens=True))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
